Loading libraries

library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(magrittr)
library(ggplot2)
library(countrycode)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# Set a placeholder Mapbox token; this prevents the mapbox token error.
Sys.setenv(MAPBOX_TOKEN = "11122223333444")
# Load the COVID-19 table from the CSV file in the working directory.
data <- read.csv(file = "owid-covid-data.csv")

#head(data)

Data preprocessing: check the column names to determine which ones are important for achieving the objectives, and check for null/NaN values.

Write the column names to a text file for my reference in the analysis

# Collect the column names of the loaded data frame.
columns <- names(data)
# Open columns.txt for writing, emit one column name per line, then release
# the connection.
file_columns <- file("columns.txt", open = "w")
writeLines(columns, con = file_columns)
close(file_columns)

Check which columns contain null (NA) values

#which(is.na(data))
#cols_with_na <- which(apply(data, 2, function(x) any(is.na(x))))
#colnames(data)[cols_with_na] # This indicates that every feature in the dataset has missing values in at least one row, except iso_code, continent, location and date.
#colSums(is.na(data))

Visualizing the total reported cases since the start of the pandemic

# Alias the loaded data as `df` for the plotting code.
df <- data
# Base plot of total_cases against date, built from the rows ordered by
# total_cases; the line trace is added and printed in a separate step.
p <- plot_ly(arrange(df, total_cases), x = ~date, y = ~total_cases)
p %>% add_lines()

Considering the new tests

# Base plot of new_tests against date.
p <- df %>%
  arrange(new_tests) %>%
  plot_ly(x = ~date, y = ~new_tests)
# Add the line trace in its own top-level call so the figure is printed.
# Ending the pipe with `%>% add_lines(p)` would hand the previously stored
# plot `p` to add_lines() as its `x` argument and only assign the result,
# so the chart was never displayed.
add_lines(p)

A notable observation from these visualizations is that, although millions of cases were reported, the number of new tests was relatively low. This implies that not all countries reported their new tests, including some of those with the highest numbers of reported cases.

To confirm the validity of this conclusion, the following visualizations dig deeper, highlighting the top countries with the most cases and the most new tests.

# Derive the calendar month and year of every record in df.
dates <- df$date
parsed_dates <- ymd(dates)
months <- month(parsed_dates)
years <- year(parsed_dates)
# Compact data frame holding only the columns used for the continent summary.
new_df <- data.frame(
  Month = months,
  Year = years,
  Continent = df$continent,
  total_cases = df$total_cases
)

# Sum total_cases for every Continent/Year pair.
continent_year_totals <- summarise(
  group_by(new_df, Continent, Year),
  totals = sum(total_cases, na.rm = TRUE)
)
## `summarise()` has grouped output by 'Continent'. You can override using the
## `.groups` argument.
# Drop rows with a missing continent, sort from largest to smallest total and
# keep the top 10 by `totals`.
# NOTE(review): the summary is still grouped by Continent at this point, so
# top_n() selects within each continent rather than across the whole table --
# confirm this is the intended ranking.
top10 <- continent_year_totals %>%
  filter(!is.na(Continent)) %>%
  arrange(desc(totals)) %>%
  top_n(10)
## Selecting by totals
# Bar chart of the summed totals per continent.
plot_ly(top10, x = ~Continent, y = ~totals, type = "bar")
# Switch to the second dataset, which has fixed latitudes, for better map
# visualization.
new_data <- read.csv(file = "coronavirus.csv")
# Store the loaded data frame in an .RData file as well.
save(list = "new_data", file = "corona.RData")
# Preview the first rows.
head(new_data)
##         date province country     lat      long      type cases   uid iso2 iso3
## 1 2020-01-22  Alberta  Canada 53.9333 -116.5765 confirmed     0 12401   CA  CAN
## 2 2020-01-23  Alberta  Canada 53.9333 -116.5765 confirmed     0 12401   CA  CAN
## 3 2020-01-24  Alberta  Canada 53.9333 -116.5765 confirmed     0 12401   CA  CAN
## 4 2020-01-25  Alberta  Canada 53.9333 -116.5765 confirmed     0 12401   CA  CAN
## 5 2020-01-26  Alberta  Canada 53.9333 -116.5765 confirmed     0 12401   CA  CAN
## 6 2020-01-27  Alberta  Canada 53.9333 -116.5765 confirmed     0 12401   CA  CAN
##   code3    combined_key population continent_name continent_code
## 1   124 Alberta, Canada    4413146  North America           <NA>
## 2   124 Alberta, Canada    4413146  North America           <NA>
## 3   124 Alberta, Canada    4413146  North America           <NA>
## 4   124 Alberta, Canada    4413146  North America           <NA>
## 5   124 Alberta, Canada    4413146  North America           <NA>
## 6   124 Alberta, Canada    4413146  North America           <NA>
# Get the leading countries in terms of the number of confirmed cases.
# Only rows of type "confirmed" are counted. slice_max() keeps the ten largest
# totals (ties at the cutoff are kept) and returns them ordered from highest
# to lowest. na.rm = TRUE stops a single missing count from turning a
# country's whole total into NA.
filtered_df <- new_data %>%
  filter(type == "confirmed") %>%
  group_by(country) %>%
  summarise(sum_Cases = sum(cases, na.rm = TRUE)) %>%
  slice_max(sum_Cases, n = 10)

# Bar chart of the ten totals. A discrete x axis is ordered alphabetically by
# default, which hides the ranking, so the axis order is pinned to the row
# order of filtered_df (highest total first).
plt <- ggplot(filtered_df, aes(x = country, y = sum_Cases)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  scale_x_discrete(limits = filtered_df$country) +
  theme_minimal()
ggplotly(plt)
# Creating a choropleth map of confirmed cases per country.
# `dates` and `Date` remain available as top-level objects for any code that
# reads them; the summary below parses the `date` column itself so that it
# stays aligned with the filtered rows.
dates <- new_data$date
Date <- ymd(dates)

#new_data$code3 <- countrycode(new_data$country, "country.name", "iso3c") #convert the names into 3 letter country codes

# Monthly confirmed-case totals per country.
# - Only type == "confirmed" rows are summed, so other record types are not
#   counted as reported cases.
# - YearMonth is a label such as "Jan 2020", formatted straight from the date.
# - .groups = "drop_last" keeps the result grouped by country and YearMonth,
#   which is what summarise() did by default, without emitting the message.
cases_summed <- new_data %>%
  filter(type == "confirmed") %>%
  mutate(YearMonth = format(ymd(date), "%b %Y")) %>%
  group_by(country, YearMonth, iso3) %>%
  summarise(cases_sum = sum(cases, na.rm = TRUE), .groups = "drop_last")

# A choropleth needs a single value per location. Passing the monthly table
# would give every country several z values, so collapse the months into one
# total per country/ISO code before plotting.
cases_by_country <- cases_summed %>%
  group_by(country, iso3) %>%
  summarise(cases_sum = sum(cases_sum), .groups = "drop")

# iso3 is used for the country codes.
# The yellow-to-red scale is set on the trace itself: a `coloraxis` entry in
# layout() is only used by traces that reference it, which this trace does not.
cases_by_country %>%
  plot_ly(
    z = ~cases_sum,
    text = ~country,
    locations = ~iso3,
    locationmode = "ISO-3",
    type = "choropleth",
    colorscale = list(c(0, "yellow"), c(1, "red"))
  ) %>%
  colorbar(title = "Total Covid-reported cases ") %>%
  layout(
    title = "Cases by Country",
    geo = list(showframe = FALSE, showcoastlines = FALSE)
  )